In [1]:
import pandas as pd
import numpy as np
import sys
%matplotlib inline

In [2]:
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__


Python version 2.7.5 |Anaconda 2.1.0 (64-bit)| (default, Jul  1 2013, 12:37:52) [MSC v.1500 64 bit (AMD64)]
Pandas version 0.15.2

Select

How do I select a random sample of a group?


In [3]:
# Initial dataframe
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'value'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df


Out[3]:
group1 group2 value
0 a 1 apple
1 b 2 pear
2 a 3 orange
3 a 4 apple
4 b 1 banana
5 c 3 durian
6 c 5 lemon
7 c 6 lime
8 c 5 raspberry
9 c 4 durian
10 a 1 peach
11 a 2 nectarine
12 a 3 banana
13 b 4 lemon
14 b 3 guava
15 b 2 blackberry
16 b 1 grape

In [4]:
# We don't simply want to select random rows from df
# We want to first group df by (group1 & group2) then select random rows
from random import choice

# First create the group
grouped = df.groupby(['group1','group2'])
grouped.size()


#Notice that group (a,1) has two posibilities
#Notice that group (a,2) has one posibilities

#This means that if we select a random sample from group (a,1) we will get either "apple" or "peach"
#This means that if we select a random sample from group (a,2) we will always get "nectarine"


Out[4]:
group1  group2
a       1         2
        2         1
        3         2
        4         1
b       1         2
        2         2
        3         1
        4         1
c       3         1
        4         1
        5         2
        6         1
dtype: int64

In [5]:
#df.loc[select a random record from each group]
df.loc[(choice(x) for x in grouped.groups.itervalues())]


Out[5]:
group1 group2 value

How do I slice each row of a column?


In [6]:
df = pd.DataFrame(data=['abcdef']*10, columns=['text'])
df


Out[6]:
text
0 abcdef
1 abcdef
2 abcdef
3 abcdef
4 abcdef
5 abcdef
6 abcdef
7 abcdef
8 abcdef
9 abcdef

In [7]:
# Select the first 2 characters of each row
df['text'].apply(lambda x: x[:2])


Out[7]:
0    ab
1    ab
2    ab
3    ab
4    ab
5    ab
6    ab
7    ab
8    ab
9    ab
Name: text, dtype: object

How can I select rows of my dataframe based on a "complex" filter applied to multiple columns?


In [8]:
d = {'Dates':[pd.Timestamp('2013-01-02'),
              pd.Timestamp('2013-01-03'),
              pd.Timestamp('2013-01-04')],
     'Num1':[1,2,3],
     'Num2':[-1,-2,-3]}
                 

df = pd.DataFrame(data=d)
df


Out[8]:
Dates Num1 Num2
0 2013-01-02 1 -1
1 2013-01-03 2 -2
2 2013-01-04 3 -3

In [9]:
# where all values in column "Num1" are positive
positive = df['Num1'] > 0

# where values in column "Num2" is equal to -1
negativeOne = df['Num2'] == -1

# where values in the column "Dates" are in (1/2/2013 or 1/20/2013)
Dates = df['Dates'].isin(['2013-01-02','2013-01-20'])

df[positive & negativeOne & Dates]


Out[9]:
Dates Num1 Num2
0 2013-01-02 1 -1

How to get the maximum value of a group?


In [10]:
df = pd.DataFrame({'col1':['minus','minus','positive','nan'],
                'col2':[10,20,30,40],
                'col3':[-10,-20,30,np.nan]
                })
df


Out[10]:
col1 col2 col3
0 minus 10 -10
1 minus 20 -20
2 positive 30 30
3 nan 40 NaN

In [11]:
# Method 1
df.groupby('col1').apply(lambda x: x.max())


Out[11]:
col1 col2 col3
col1
minus minus 20 -10
nan NaN 40 NaN
positive positive 30 30

In [12]:
# Method 2
df.groupby('col1').agg('max')


Out[12]:
col2 col3
col1
minus 20 -10
nan 40 NaN
positive 30 30

How to select records from one level of a multi-index data frame?


In [13]:
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'group2'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df


Out[13]:
value
group1 group2
a apple 1
b pear 2
a orange 3
apple 4
b banana 1
c durian 3
lemon 5
lime 6
raspberry 5
durian 4
a peach 1
nectarine 2
banana 3
b lemon 4
guava 3
blackberry 2
grape 1

In [14]:
df.xs('a', level='group1')


Out[14]:
value
group2
apple 1
orange 3
apple 4
peach 1
nectarine 2
banana 3

How do I reset the index when the index names are the same as the column names?


In [15]:
df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , 
                "City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
               )
df


Out[15]:
City Name
0 Seattle Alice
1 Seattle Bob
2 Portland Mallory
3 Seattle Mallory
4 Seattle Bob
5 Portland Mallory

In [16]:
group = df.groupby(['City','Name'])
s = group.agg('size')
s.add_suffix('_size').reset_index()


Out[16]:
City Name 0
0 Portland_size Mallory_size 2
1 Seattle_size Alice_size 1
2 Seattle_size Bob_size 2
3 Seattle_size Mallory_size 1

Author: David Rojas